3  PBMC GSEA Example

3.1 Running logistic SuSiE on the PBMC data

source('notebooks/pbmcs/functions.R')
reticulate::use_virtualenv('gibss')
data <- prep_pbmc_data()
#' function for getting the gene set data
load_regression_data <- function(data, celltype, quantile, db_name){
  db <- WebGestaltR::loadGeneSet(enrichDatabase = db_name)
  data %>%
    make_gene_list(celltype, q) %>%
    gseasusie:::prepare_data()
}

#' driver function for running logistic SuSiE on pbmc data + webgestalt gene sets
driver <- function(celltype, q, db, path){
  print("I'm driving here!")
  data %>%
    make_gene_list(celltype, q) %>%
    {gseasusie::fit_gsea_susie_webgestalt(
      .$gene_list, 
      .$gene_background, 
      db,
      tol=1e-5,
      maxiter=100,
      alpha=0.5,
      gamma=0.0
    )} %>%
    save_model(path=path)
  return(T)
}

Here is an example of how to use the driver function.

# the driver function fits the model and saves it, just returns true
driver(
  'CD14+ Monocyte', 
  0.1, 
  'geneontology_Biological_Process',
  'notebooks/pbmcs/results/cd14plusmonocyte/geneontology_Biological_Process/q=0.1.rds'
)
# load the results. Contains $fit, $data, $time
res <- readRDS('notebooks/pbmcs/results/cd14plusmonocyte/geneontology_Biological_Process/q=0.1.rds')

Now we run logistic SuSiE using the top 10% of genes in each celltype. We rerun the analysis for the gene ontology subsets available through the WebGestaltR package. WebGestaltR offers various subsets of the GO, e.g. geneontology_Biological_Process, and an abridged version geneontology_Biological_Process_noRedundant.

celltypes <- names(data)
celltypes_sanitized <- c('cd14plusmonocyte', 'cd19plusb', 'cd34plus', 'cd56plusnk', 'tcell')
quantiles <- c(0.1)
db <- c(
  'geneontology_Biological_Process', 
  'geneontology_Biological_Process_noRedundant', 
  'geneontology_Cellular_Component', 
  'geneontology_Cellular_Component_noRedundant', 
  'geneontology_Molecular_Function', 
  'geneontology_Molecular_Function_noRedundant'
)

config <- tidyr::crossing(
  tibble(celltype = celltypes, celltype_sanitized=celltypes_sanitized),
  q=quantiles,
  db = db
)
make_path <- function(celltype_sanitized, db, q){
  glue::glue('notebooks/pbmcs/results/{celltype_sanitized}/{db}/q={q}.rds')
}

config %>%
  dplyr::rowwise() %>%
  dplyr::mutate(path = make_path(celltype_sanitized, db, q)) %>%
  dplyr::mutate(run=driver(celltype, q, db, path))
make_cs_tbl_single <- function(fit, l){
  cs <- fit$fit$credible_sets[[l]]
  tidyr::as_tibble(cs) %>%
    dplyr::mutate(
      geneSetIdx = cs + 1, 
      component = paste0('L', l),
      lbf_ser = fit$fit$lbf_ser[[l]]
    ) %>%
    dplyr::select(-cs) %>%
    dplyr::left_join(fit$data$geneSets) %>%
    dplyr::group_by(component, geneSet) %>%
    dplyr::mutate(
      geneSetSize = n(),
      propInList = mean(geneInList)
    ) %>%
    dplyr::ungroup()
}

make_cs_tbl <- function(fit, min_lbf_ser=log(10.)){
  # include components with large enough lbf_ser
  include_components <- which(fit$fit$lbf_ser > min_lbf_ser)
  purrr::map_dfr(include_components, ~make_cs_tbl_single(fit, .x))
}

make_component_tbl_single <- function(fit, l){
  # component, gene set, alpha, beta, lbf, prior_variance
  with(fit$fit, tibble(
    component=glue::glue('L{l}'), 
    geneSet = fit$data$geneSetMapping$geneSet, 
    alpha=alpha[l,],
    beta=beta[l,], 
    lbf=lbf[l,], 
    prior_variance=prior_variance[l]))
}

make_component_tbl <- function(fit, min_lbf_ser = log(10)){
  include_components <- which(fit$fit$lbf_ser > min_lbf_ser)
  purrr::map_dfr(include_components, ~make_component_tbl_single(fit, .x))
}
db = 'geneontology_Biological_Process'
q = 0.1
celltype = 'CD19+ B'
celltypes_sanitized_list <- as.list(celltypes_sanitized)
names(celltypes_sanitized_list) <- celltype

load_model <- function(celltype, q, db){
  # load fit model
  path <- glue::glue('notebooks/pbmcs/results/{celltypes_sanitized_list[[celltype]]}/{db}/q={q}.rds')
  message(glue::glue('loading fit from: {path}'))
  fit <- readRDS(path)
  return(fit)
}

load_model2 <- function(path){
  # load fit model
  message(glue::glue('loading fit from: {path}'))
  fit <- readRDS(path)
  return(fit)
}

make_cs_tbl_nested <- function(fit){
  more_go_info <- AnnotationDbi::select(GO.db::GO.db,
         keys = unique(fit$data$geneSets$geneSet),
         columns = c('TERM', 'DEFINITION'),
         keytype = 'GOID') %>% 
    as_tibble() %>%
    dplyr::mutate(geneSet=GOID)
  
  more_gene_info <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
        keys = unique(fit$data$geneMapping$gene),
        columns = c('SYMBOL', 'GENETYPE', 'GENENAME'),
        keytype = 'ENTREZID') %>%
    as_tibble() %>%
    dplyr::mutate(gene = ENTREZID)
  
  gene_columns <- c('ENTREZID', 'SYMBOL', 'GENENAME', 'geneInList')
  gene_set_columns <- c('geneSet', 'TERM', 'DEFINITION', 'alpha', 'beta', 'lbf', 'geneSetSize', 'propInList')
  component_columns <- c('component', 'size', 'coverage', 'target_coverage', 'lbf_ser', 'prior_variance')
  
  all_columns <- c(component_columns, gene_set_columns, gene_columns)
  
  message('building nested credible set table')
  fit %>%
    make_cs_tbl() %>%
    left_join(make_component_tbl(fit)) %>% # add effect estimates, etc.
    left_join(more_go_info) %>%
    left_join(more_gene_info) %>%
    dplyr::select(all_columns) %>%
    # nest gene level data
    nest(.by= c(component_columns, gene_set_columns), .key='details') %>%
    # nest gene set level data
    nest(.by= component_columns, .key='details')
}

cs_tbl_nested <- load_model(celltype, q, db) %>% 
  make_cs_tbl_nested()
nested_summary <- config %>%
  dplyr::rowwise() %>%
  dplyr::mutate(path = make_path(celltype_sanitized, db, q)) %>%
  ungroup() %>%
  dplyr::mutate(nested_table = purrr::map(path, ~make_cs_tbl_nested(readRDS(.x))))
nested_summary %>%
  saveRDS('notebooks/pbmcs/results/nested_table.rds')

3.2 A table of PBMC results on GO

source('notebooks/pbmcs/functions.R')
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- readRDS('notebooks/pbmcs/results/nested_table.rds') %>%
  dplyr::select(-path) %>%
  dplyr::rename(details = nested_table)


reactable(
  dplyr::select(data, -c(celltype_sanitized, details)),
  details = make_get_details(data))